by Andrew Trask
In [14]:
def pretty_print_review_and_label(i):
    """Print the label at index ``i`` next to a preview of the matching review.

    The preview is the first 70 characters of ``reviews[i]`` followed by
    ``...``. Both ``labels`` and ``reviews`` are read from module scope.
    """
    label = labels[i]
    preview = reviews[i][:70]
    print(label + "\t: " + preview + "...")
# Load the raw data: one review per line in reviews.txt and, on the
# corresponding line of labels.txt, that review's label (upper-cased here).
#
# `with` guarantees each file is closed even if reading raises, and
# rstrip('\n') removes only the line terminator, so a final line that lacks a
# trailing newline keeps its last character (a blind [:-1] slice would drop it).
with open('reviews.txt', 'r') as g:  # What we know!
    reviews = [line.rstrip('\n') for line in g]
with open('labels.txt', 'r') as g:  # What we WANT to know!
    labels = [line.rstrip('\n').upper() for line in g]
In [2]:
# Number of reviews loaded (one entry per line of reviews.txt).
len(reviews)
Out[2]:
In [5]:
# Peek at the raw text of the second review (index 1).
reviews[1]
Out[5]:
In [6]:
# Label stored at the same index as the review shown above.
labels[1]
Out[6]:
In [15]:
# Print a header row, then a handful of hand-picked label/review previews.
print("labels.txt \t : \t reviews.txt\n")
for sample_index in (2137, 12816, 6267, 21934, 5297, 4998):
    pretty_print_review_and_label(sample_index)
Counting all the words:
In [126]:
from collections import Counter
import numpy as np
# Tally every whitespace-separated token across all reviews into one Counter.
c = Counter()
c.update(token for text in reviews for token in str(text).split())
The most common words have no predictive power:
In [125]:
# Snapshot the 24 most frequent (word, count) pairs from `c`; the cutoff of 24
# is hard-coded. The bare name on the last line displays the list.
common_words = c.most_common(24)
common_words
Out[125]:
In [88]:
# Drop each word listed in `common_words` from `c` (this mutates `c` in place),
# then display the ten most frequent words that remain.
for token, _freq in common_words:
    del c[token]
c.most_common(10)
Out[88]:
Hmm... it would be more useful to have two counters: one for negative reviews and one for positive reviews.
In [127]:
# Build three tallies in one pass over the corpus:
#   negative_words - tokens from reviews labelled "NEGATIVE"
#   positive_words - tokens from every other review
#   total_words    - tokens from all reviews
negative_words = Counter()
positive_words = Counter()
total_words = Counter()
for idx, text in enumerate(reviews):
    for token in str(text).split():
        # Any label other than "NEGATIVE" is counted as positive.
        bucket = negative_words if labels[idx] == "NEGATIVE" else positive_words
        bucket[token] += 1
        total_words[token] += 1
In [172]:
# For every word seen more than 500 times overall, record how strongly it
# leans positive (pos / (neg + 1)) and negative (neg / (pos + 1)). The +1.0 in
# the denominator avoids division by zero and forces float division.
pos_neg_ratios = Counter()
neg_pos_ratios = Counter()
for word, cnt in total_words.most_common():
    if cnt <= 500:
        continue  # too rare to include
    n_pos = positive_words[word]
    n_neg = negative_words[word]
    pos_neg_ratios[word] = n_pos / (n_neg + 1.0)
    neg_pos_ratios[word] = n_neg / (n_pos + 1.0)
pos_neg_ratios.most_common(20)
Out[172]:
In [171]:
# The 20 words with the highest negative-to-positive ratio.
neg_pos_ratios.most_common(20)
Out[171]:
In [ ]: